library(tidyverse)
library(ggdist)
library(ggside)
library(easystats)
library(patchwork)
# Ask the user to point at the project root; all data paths are relative to it.
folder <- rstudioapi::selectDirectory(caption = "Select Directory", label = "Select")

# Load the preprocessed trial-level responses and drop the exported row index column.
df <- read.csv(file.path(folder, "9_analysis/Online Study/data/preprocessed.csv")) %>%
  select(-X)

nrow(df)
## [1] 12240
length(unique(df$Participant))
## [1] 408
# Video_Time records how long the whole stimulus trial lasted; rename for clarity.
df <- df %>%
  rename(trial_duration = Video_Time)
# Inspect the full column set: trial-level ratings, timing/attention measures,
# questionnaire items and summary scores, demographics, and session metadata.
names(df)
## [1] "Video" "Participant"
## [3] "Restoration" "Presence"
## [5] "WillingnessToWalk" "Beauty"
## [7] "Structure" "Interest"
## [9] "Familiarity" "Scenic"
## [11] "Crowdedness" "Width"
## [13] "Valence" "Arousal"
## [15] "trial_duration" "Distracted_Time"
## [17] "stim_screen_1" "stim_screen_2"
## [19] "stim_screen_3" "stim_screen_4"
## [21] "stim_screen_5" "stim_screen_6"
## [23] "stim_screen_7" "stim_screen_8"
## [25] "stim_screen_9" "stim_screen_10"
## [27] "SSA_Mean" "covid_1"
## [29] "covid_2" "covid_3"
## [31] "covid_4" "Concern_Covid_Mean"
## [33] "sias1" "sias2"
## [35] "sias3" "sias4"
## [37] "sias5" "sias6"
## [39] "SIAS_Total_Mean" "sps1"
## [41] "sps2" "sps3"
## [43] "sps4" "sps5"
## [45] "sps6" "SPS_Total_Mean"
## [47] "Extraversion_1" "Agreeableness_2"
## [49] "Conscientiousness_3" "Neuroticism_4"
## [51] "Openness_5" "Conscientiousness_10"
## [53] "Agreeableness_14" "Neuroticism_16"
## [55] "Extraversion_23" "HonestyHumility_6_R"
## [57] "Extraversion_7_R" "Agreeableness_8_R"
## [59] "Openness_9_R" "Conscientiousness_11_R"
## [61] "HonestyHumility_12_R" "Openness_13_R"
## [63] "Neuroticism_15_R" "Neuroticism_17_R"
## [65] "HonestyHumility_18_R" "Extraversion_19_R"
## [67] "Agreeableness_20_R" "Openness_21_R"
## [69] "Conscientiousness_22_R" "HonestyHumility_24_R"
## [71] "ipip_Extroversion" "ipip_Agreeableness"
## [73] "ipip_Conscientiousness" "ipip_Neuroticism"
## [75] "ipip_Openness" "ipip_Honesty"
## [77] "crowds_1" "crowds_2"
## [79] "crowds_3" "Crowd_Preference_Mean"
## [81] "Age" "Background_Architecture"
## [83] "Background_Arts" "Background_Rural"
## [85] "Grow.up.country" "Grow.up.City"
## [87] "Lives_now" "years_Lives_now"
## [89] "NSS" "Condition"
## [91] "screen_width" "screen_height"
# Metadata for the full stimulus set: one row per candidate video (480 total).
video_data <- read_csv(paste0(folder,"/3_materials/stimuli_scripts/final_set_with_testing_groups_typology_220914.csv"))
## Rows: 480 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): videoname, name_df, name_df_keep, link, quality, rendition, Countr...
## dbl (8): fps, duration, Cluster, testing_group, n_frames, mean_pedcounts, m...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sanity check: each row of the stimulus file should be a unique video.
length(unique(video_data$videoname))
## [1] 480
# Add a consistent "video_" prefix to every stimulus-metadata column so the
# columns are easy to identify after joining onto the trial data.
# Fix: the original alternation "video|video_" always matched "video" first,
# so a column already named "video_*" would have become "video__*" (double
# underscore). Anchoring with "^video_?" strips any existing prefix cleanly
# before re-adding it, and is a no-op for all other names.
video_data <- video_data %>%
  rename_with(.cols = everything(),
              .fn = function(x) paste0("video_", str_remove(x, "^video_?")))
# Confirm every stimulus-metadata column now carries the "video_" prefix.
names(video_data)
## [1] "video_name" "video_name_df"
## [3] "video_name_df_keep" "video_link"
## [5] "video_quality" "video_rendition"
## [7] "video_fps" "video_duration"
## [9] "video_Country" "video_Cluster"
## [11] "video_City" "video_testing_group"
## [13] "video_primary_category" "video_secondary_category"
## [15] "video_n_frames" "video_mean_pedcounts"
## [17] "video_max_pedcounts" "video_sum_pedcounts"
A total of 408 participants completed the study on Prolific, and between them they watched a total of 480 videos. Some of them faced streaming issues or had low attention scores, so let’s develop a strategy to exclude them from the analysis.
# Attach per-video metadata to each trial; "Video" in df holds the stream link.
df <- df |>
  left_join(video_data, by = c("Video" = "video_link"))
As an indirect measure of attention, during the experiment, we measured the duration (if any) that participants spent interacting with other programs on their computer. Browsers’ javascript API includes the events focus and blur which are fired when the user interacts with the browser. In our case, when a participant clicks on a different browser tab, browser window, or different application, a new row is added in the log with an event = “blur”; when they return to the experiment another event = “focus” is added to the experiment log. We then calculated distracted time specifically for the video presentation trials, as the time interval between a blur and the next focus event, or between a blur and the end of a trial (this was necessary for a few cases where the participant returned to the experiment after a video had finished playing).
Below we inspect the data and decide on a filtering strategy.
We select all video (stimulus presentation) trials in which distracted time is greater than 0, and convert milliseconds to seconds. We can observe there are 149 trials with some distraction out of 12240 total trials.
# Cumulative view of per-trial distraction time (seconds, sorted ascending).
df |>
  filter(Distracted_Time > 0) |>
  mutate(Distracted_Time = round(Distracted_Time / 1000)) |>  # ms -> whole seconds
  arrange(Distracted_Time) |>
  ggplot(aes(cumsum(Distracted_Time), Distracted_Time)) +
  geom_line() +
  geom_point() +
  labs(title = "Seconds spent away from the experiment window during video trials",
       x = "Ascending values")
Second, we should examine if these ‘moments of distraction’ happen randomly across participants, or if some participants are consistently not paying attention to the task. So here we count, for each participant, the number of trials in which distracted time is above 0. We can see that while some participants are distracted once or twice, others are distracted in 7 or even 27 out of 30 video trials.
# How many trials per participant contain any distraction at all?
df |>
  filter(Distracted_Time > 0) |>
  count(Participant) |>
  arrange(n) |>
  pull(n)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3
## [26] 3 5 5 6 6 7 7 13 13 22 27
Let’s look more closely at those participants with no more than 2 distractions. How long were they? They seem to range from minimal (1 second) to severe (5 minutes or 320 seconds).
# Distraction durations (seconds) for participants distracted on at most 2 trials.
df |>
  filter(Distracted_Time > 0) |>
  group_by(Participant) |>
  add_count() |>          # n = distracted trials for this participant
  filter(n < 3) |>
  mutate(Distracted_Time = round(Distracted_Time / 1000)) |>  # ms -> whole seconds
  arrange(Distracted_Time) |>
  pull(Distracted_Time)
## [1] 0 1 1 2 2 2 3 3 4 4 4 5 5 6 7 7 9 13 16
## [20] 17 18 18 20 24 25 25 31 32 35 62 101 320
We cannot really know why participants chose to look away from the experiment. However, we know each video should play for 30 seconds, and it can take some seconds to buffer based on the internet connection. We also know that when the connection is bad, a video can take quite some time to load, which could explain why participants switch away from the experiment.
# Does distraction time track total trial length (a proxy for slow buffering)?
df |>
  filter(Distracted_Time > 0) |>
  mutate(Distracted_Time = round(Distracted_Time / 1000)) |>  # ms -> whole seconds
  group_by(Participant) |>
  add_count() |>
  filter(n < 3) |>
  ggplot(aes(trial_duration, Distracted_Time)) +
  geom_point() +
  ggtitle("Is distraction time explained by slow video buffering?")
It’s quite unclear. Even for videos with fast buffering (around 30+ seconds for the entire video trial), participants spend some time distracted.
Based on the audit above, we concluded the following filtering strategy:
# Apply the distraction-based exclusions decided above.
df_pruned <- df |>
  group_by(Participant) |>
  # number of trials in which this participant showed any distraction
  mutate(Distracted_n = sum(Distracted_Time > 0)) |>
  # drop participants who were distracted at all on more than 2 videos
  filter(Distracted_n < 3) |>
  # within the remaining participants, keep only responses whose distraction
  # rounds to 0 seconds
  mutate(Distracted_Time = round(Distracted_Time / 1000)) |>
  filter(Distracted_Time == 0)

n_distinct(df_pruned$Participant)
## [1] 395
nrow(df_pruned)
## [1] 11819
So after this pruning, we are left with 395 participants.
A second concern we have is if participants responded intentionally, or just ‘clicked-away’… We will assess this by looking at the variance per response item, as well as the deviation from the average.
# Per-participant SD of each rating scale, to spot flat "click-through"
# response patterns (SD < 0.25 highlighted).
# Modernised: summarise_at()/vars() are superseded in dplyr; across() is the
# current equivalent and selects the same positions (grouping columns are
# excluded from tidyselect in both forms).
# NOTE(review): positions 3:13 presumably cover the rating scales Presence
# through Valence — confirm against names(df_pruned).
df_pruned |>
  group_by(Participant) |>
  summarise(across(3:13, function(x) sd(x, na.rm = TRUE))) |>
  pivot_longer(cols = 2:12) |>
  ggplot(aes(value, Participant, colour = value < 0.25)) +
  geom_point() +
  facet_grid(. ~ name) +
  guides(y = "none")
In the figure above, we can observe that some participants have very low variance on some of the scales. One exception is Presence, where many participants don’t have a large variance — but that is to be expected: we included this parameter as a manipulation check and we expect presence to be stable across trials.
Now we will calculate the average SD across scales for each participant.
# Average SD across all rating scales, per participant; consistently low
# values indicate low-effort responding.
# Modernised: summarise_at()/vars() are superseded — use across() (same
# column positions; grouping columns excluded from tidyselect in both forms).
df_pruned |>
  group_by(Participant) |>
  summarise(across(3:13, function(x) sd(x, na.rm = TRUE))) |>
  pivot_longer(cols = 2:12) |>
  group_by(Participant) %>%
  summarise(av_value = mean(value)) %>%
  arrange(av_value) %>%
  ggplot(aes(av_value, reorder(Participant, av_value))) +
  guides(y = "none") +
  lims(x = c(0, 7)) +
  geom_point()
Here we see that 3 participants specifically have consistently low variance in their responses. We will interpret this as a sign of low effort and remove them.
# Save the low-effort participant IDs separately, so they can be filtered out
# of df_pruned below.
# Modernised: summarise_at()/vars() are superseded — use across().
# NOTE(review): this recomputes the per-participant mean SD from `df`
# (pre-pruning) rather than `df_pruned` used in the plot above — the reported
# count (3) matches, but confirm which data frame was intended.
low_effort_ids <- df |>
  group_by(Participant) |>
  summarise(across(3:13, function(x) sd(x, na.rm = TRUE))) |>
  pivot_longer(cols = 2:12) |>
  group_by(Participant) %>%
  summarise(av_value = mean(value)) %>%
  arrange(av_value) %>%
  filter(av_value < 0.75) %>%   # threshold chosen from visual inspection above
  pull(Participant)

length(low_effort_ids)
## [1] 3
We should also remove participants who had severe video streaming problems, as these might have influenced their attention and/or appraisals.
# Buffering per trial = total trial playback time minus the video length.
# BUG FIX: the original stored the capped video length in a new column
# `duration` but then computed Buffering from the *uncapped* video_duration.
# The exclusion step later in this document caps video_duration itself before
# computing Buffering, so this plot now does the same for consistency.
df_pruned %>%
  ungroup() %>%
  # some source videos are in fact longer than 30 s, but playback was capped at 30 s
  mutate(video_duration_capped = if_else(video_duration > 30, 30, video_duration),
         Buffering = trial_duration - video_duration_capped) %>%
  arrange(Buffering) %>%
  ggplot(aes(cumsum(Buffering), Buffering, colour = Buffering > 30)) +
  geom_point() +
  guides(x = "none") +
  labs(title = "Video buffering time per trial" ,subtitle = "Buffering = Playback duration - Video duration")
In the figure above we see that a number of trials show buffering less
than 0, which means that trial_duration < video_duration — this
indicates some kind of issue with the stimulus presentation. We remove
these trials. We could also be concerned that when videos take longer to
load, participants’ sense of presence dips. Visual inspection (below)
suggests that this is not the case.
# Presence vs. total trial duration: check whether slow loading reduces the
# sense of presence. The dashed line marks trials that took twice the nominal
# 30 s playback.
ggplot(df_pruned, aes(trial_duration, Presence) ) +
  geom_jitter() + geom_smooth(method = "lm") +
  geom_vline(xintercept = 60, colour = "red", linetype = "dashed") +
  theme_classic() +
  # fixed subtitle typo: "It does not like" -> "It does not look like"
  labs(title = "Does sense of presence drop with longer video loading times?",
       subtitle = "It does not look like this is an issue")
## `geom_smooth()` using formula = 'y ~ x'
We should also test if some participants had too many streaming problems which could also influence their responses.
# Distribution of remaining-trial counts per participant.
# Fix: name the second count explicitly — the default clashes with the
# existing `n` column, which triggered the "Storing counts in `nn`" message
# seen in the original output; the result is identical.
df_pruned %>%
  group_by(Participant) %>%
  tally( ) %>% arrange(n) %>%
  count(n, name = "nn")
Finally, we decided to keep videos where excess time (buffering) was less than 2/3 of the video duration, i.e. no more than 20 seconds. This leads to a total of 50 seconds for the whole video duration.
# Record how many trials existed before the exclusions (used for reporting).
# Fix: use `<-` for assignment per R convention (original used `=`).
previous_video_count <- nrow(df)

# Drop the low-effort responders identified above.
df_pruned <- df_pruned |>
  filter(! Participant %in% low_effort_ids)
We define buffering time as the excess time of each trial beyond the duration of the video — in other words, the time attributed to loading the video in the participant’s browser.
\[ (1)~ video~duration + buffering = video~time \] \[ (2)~ buffering = video~time - video~duration \]
Now we will exclude videos:
# Cap video lengths at the 30 s playback limit, compute buffering, and drop
# trials where buffering exceeded the (capped) video length or where playback
# was shorter than the video itself (a sign of presentation problems).
# NOTE(review): the accompanying text mentions a 2/3-of-duration (20 s)
# threshold, but this filter allows buffering up to the full capped duration
# (30 s) — confirm which threshold was intended.
df_pruned <- df_pruned |>
  mutate(video_duration = pmin(video_duration, 30),
         Buffering = trial_duration - video_duration) |>
  filter(Buffering < video_duration,
         video_duration < trial_duration)

n_distinct( df_pruned$Participant )
## [1] 387
One last step: if we have too few trials from a participant, we should also exclude their responses. We create a summary table showing how many trials we have per participant. We see that 310 participants have full sets (30 trials), but 10 participants have 14 or fewer trials left after removing videos with streaming issues.
# Tabulate how many participants are left with each number of valid trials.
df_pruned %>%
  group_by(Participant) %>%
  tally() %>%
  arrange(n) %>%
  pull(n) %>%
  table()
## .
## 4 7 9 10 11 13 14 15 16 18 19 21 23 24 25 26 27 28 29 30
## 1 1 2 1 3 1 1 1 1 1 1 2 1 3 2 1 7 16 31 310
We keep data from participants with 15 or more valid trials.
# Participants retaining at least 15 valid trials.
ids_to_keep <- df_pruned |>
  group_by(Participant) |>
  tally() |>
  filter(n >= 15) |>
  arrange(n) |>
  pull(Participant)

length(ids_to_keep)
## [1] 377

# Keep only those participants.
df_pruned <- df_pruned |>
  filter(Participant %in% ids_to_keep)
In total, these exclusions dropped 1123 video trials.
Following these filtering steps, we have retained 377 participants out of 408, and we have kept 11117 video trials out of 12240.
nrow(df_pruned)
## [1] 11117
length(unique(df$Participant))        # participants who completed the study
## [1] 408
length(unique(df_pruned$Participant)) # participants retained after pruning
## [1] 377

# How many ratings does each video retain after pruning?
df_pruned %>%
  group_by(Video) %>%
  tally(name = "n_per_video") %>%
  arrange(n_per_video) %>%
  ungroup() %>%
  summarise(av_n_per_video = mean(n_per_video),
            sd = sd(n_per_video),
            range = paste(range(n_per_video), collapse = "-"))
Keep columns we will use later.
# Inspect the full column set before trimming to the analysis variables below.
names(df_pruned)
## [1] "Video" "Participant"
## [3] "Restoration" "Presence"
## [5] "WillingnessToWalk" "Beauty"
## [7] "Structure" "Interest"
## [9] "Familiarity" "Scenic"
## [11] "Crowdedness" "Width"
## [13] "Valence" "Arousal"
## [15] "trial_duration" "Distracted_Time"
## [17] "stim_screen_1" "stim_screen_2"
## [19] "stim_screen_3" "stim_screen_4"
## [21] "stim_screen_5" "stim_screen_6"
## [23] "stim_screen_7" "stim_screen_8"
## [25] "stim_screen_9" "stim_screen_10"
## [27] "SSA_Mean" "covid_1"
## [29] "covid_2" "covid_3"
## [31] "covid_4" "Concern_Covid_Mean"
## [33] "sias1" "sias2"
## [35] "sias3" "sias4"
## [37] "sias5" "sias6"
## [39] "SIAS_Total_Mean" "sps1"
## [41] "sps2" "sps3"
## [43] "sps4" "sps5"
## [45] "sps6" "SPS_Total_Mean"
## [47] "Extraversion_1" "Agreeableness_2"
## [49] "Conscientiousness_3" "Neuroticism_4"
## [51] "Openness_5" "Conscientiousness_10"
## [53] "Agreeableness_14" "Neuroticism_16"
## [55] "Extraversion_23" "HonestyHumility_6_R"
## [57] "Extraversion_7_R" "Agreeableness_8_R"
## [59] "Openness_9_R" "Conscientiousness_11_R"
## [61] "HonestyHumility_12_R" "Openness_13_R"
## [63] "Neuroticism_15_R" "Neuroticism_17_R"
## [65] "HonestyHumility_18_R" "Extraversion_19_R"
## [67] "Agreeableness_20_R" "Openness_21_R"
## [69] "Conscientiousness_22_R" "HonestyHumility_24_R"
## [71] "ipip_Extroversion" "ipip_Agreeableness"
## [73] "ipip_Conscientiousness" "ipip_Neuroticism"
## [75] "ipip_Openness" "ipip_Honesty"
## [77] "crowds_1" "crowds_2"
## [79] "crowds_3" "Crowd_Preference_Mean"
## [81] "Age" "Background_Architecture"
## [83] "Background_Arts" "Background_Rural"
## [85] "Grow.up.country" "Grow.up.City"
## [87] "Lives_now" "years_Lives_now"
## [89] "NSS" "Condition"
## [91] "screen_width" "screen_height"
## [93] "video_name" "video_name_df"
## [95] "video_name_df_keep" "video_quality"
## [97] "video_rendition" "video_fps"
## [99] "video_duration" "video_Country"
## [101] "video_Cluster" "video_City"
## [103] "video_testing_group" "video_primary_category"
## [105] "video_secondary_category" "video_n_frames"
## [107] "video_mean_pedcounts" "video_max_pedcounts"
## [109] "video_sum_pedcounts" "Distracted_n"
## [111] "Buffering"
# Trim to the analysis variables (the order here defines the column order).
df_pruned <- df_pruned %>%
  select(
    Participant,
    # trial-level ratings
    Restoration, Presence, WillingnessToWalk, Beauty, Structure, Interest,
    Familiarity, Scenic, Crowdedness, Width, Valence, Arousal,
    # trial timing / attention
    trial_duration, Distracted_Time,
    # questionnaire summary scores
    SSA_Mean, Concern_Covid_Mean, SIAS_Total_Mean, SPS_Total_Mean,
    ipip_Extroversion, ipip_Agreeableness, ipip_Conscientiousness,
    ipip_Neuroticism, ipip_Openness, ipip_Honesty,
    Crowd_Preference_Mean, NSS, Condition,
    # session metadata
    screen_width, screen_height,
    # stimulus metadata
    video_name, video_name_df, video_name_df_keep, video_quality,
    video_rendition, video_fps, video_duration, video_Country, video_Cluster,
    video_City, video_testing_group, video_primary_category,
    video_secondary_category, video_n_frames, video_mean_pedcounts,
    video_max_pedcounts, video_sum_pedcounts,
    # derived exclusion variables
    Distracted_n, Buffering
  )
# Load per-participant demographics (exported separately and merged earlier).
# NOTE(review): this path uses "/9_Analysis/" (capital A) while the read and
# write elsewhere in this script use "/9_analysis/" — harmless on Windows and
# macOS, but it will fail on case-sensitive file systems; confirm the actual
# folder name.
demographics <- read_csv(paste0(folder, "/9_Analysis/Online Study/data/demographics_merged.csv"))
## New names:
## Rows: 430 Columns: 22
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (18): Participant, Arts_background, ArtsBg, ArchitectureBackground, Arch... dbl
## (3): Age, years_Lives_now, Approval_rate lgl (1): ...21
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...21`
# Derive analysis-ready demographic factors from the raw survey fields.
demographics <- demographics %>%
filter(!is.na(Participant)) %>%
mutate(
# Collapse employment and student status into a single binary indicator.
Employment = if_else(Employment_status %in% c("Full-time", "Part-time") | Student_status == "Yes", "Working/Studying", "Other (e.g., unemployed, retired)"),
# Lower break of 17.9 admits 18-year-olds while excluding anything younger.
Age_breaks = cut(Age, breaks = c(17.9, 30, 50, 65, 100), labels = c("18–29", "30-49", "50-64", "65+") ),
# Architecture takes precedence over Arts when a participant reports both.
Education = if_else(ArchBg == "Yes", "Architecture", if_else(ArtsBg == "Yes", "Arts", "Other")),
# Binary urban/rural indicators; tolower() guards against inconsistent casing.
Upbringing_Urban= if_else(tolower(urban_rural_background) == "urban", "Urban", "Other"),
Upbringing_Rural= if_else(tolower(urban_rural_background) == "rural", "Rural", "Other"),
Current_Urban = if_else(tolower(urban_rural_now) == "urban", "Urban", "Other")) %>%
# Fix factor level order so "Urban" is the reference level in models/plots.
mutate(
Upbringing_Urban = factor(Upbringing_Urban, levels = c("Urban", "Other")),
Current_Urban = factor(Current_Urban, levels = c("Urban", "Other"))
)
# Keep only the demographic variables used downstream.
# Fix: `Language` was listed twice in the original select(); select()
# silently de-duplicates, so dropping the repeat is behavior-neutral.
demographics <- demographics %>%
  select(Participant, Sex, Age, Language, ArtsBg, ArchBg, Lives_now,
         years_Lives_now, Employment_status, Nationality, Approval_rate,
         Ethnicity, Student_status, Employment, Age_breaks, Education,
         Upbringing_Urban, Upbringing_Rural, Current_Urban)

length(unique(demographics$Participant))
## [1] 389
# Attach demographics to the trial-level data (one row of demographics per
# participant, repeated across their trials).
df_pruned <- df_pruned |>
  left_join(demographics, by = "Participant") |>
  ungroup()

n_distinct(df_pruned$Participant)
## [1] 377
# skimr::skim(df_pruned)
We will replace the Prolific IDs with an alphanumeric code, so we can release this data with open access (OSF, etc.).
# Build a lookup of Participant -> random anonymised ID (unique number plus
# two random letters).
# Fix: the original called sample(LETTERS, replace = TRUE) without a `size`,
# which returns 26 values that paste0() then silently recycled across all
# participants; sampling with size = n() gives every participant
# independently drawn letters. seq_len(n()) also replaces the 1:length(...)
# idiom.
# NOTE(review): add set.seed() before this chunk if the anonymisation must be
# reproducible across reruns.
new_ids <-
  df_pruned %>%
  distinct(Participant) %>%
  mutate(Anonymised_ID = paste0(sample(seq_len(n()), replace = FALSE),
                                sample(LETTERS, size = n(), replace = TRUE),
                                sample(LETTERS, size = n(), replace = TRUE)))

# All anonymised IDs must be unique before we swap them in.
stopifnot(n_distinct(new_ids$Anonymised_ID) == nrow(new_ids))
# Swap the Prolific ID for the anonymised code and write the analysis file.
df_pruned <- df_pruned |>
  left_join(new_ids, by = "Participant") |>  # add anonymised ID per participant
  relocate(Anonymised_ID) |>                 # put the new ID first
  select(-Participant)                       # drop the Prolific ID entirely

write_csv(df_pruned,
          file = paste0(folder, "/9_analysis/Online Study/data/data_pruned_for_analysis_",
                        Sys.Date(), ".csv"))